#######################################
#This file conducts the analysis for "Education, Early-life, and
#Political Participation: New Evidence from a Sibling Model." 
#The analysis depends on several variables that are available
#only through special request from the WLS administrators. For
#more information on the WLS and to request access to the
#restricted data see this link:
#https://www.ssc.wisc.edu/wlsresearch/data/

rm(list =ls())
setwd("V:/hauser/users/bmjones3/Latest Analysis/education analysis")
###set up data
##get small version of Catalist that has only the variables we use
##if working with the public data, some of the variable names may need to be changed
##consult the documentation from the WLS
source("set up data from private.R")

data$ed4 <- 2
w <- which(data$educ04 < 12) 
data$ed4[w] <- 1
w <- which(data$educ04 > 12)
data$ed4[w] <- 3
w <- which(data$educ04 > 15)
data$ed4[w] <- 4

####dummy for college grad+
data$college_grad <- 0
w <- which(data$educ04 > 15)
data$college_grad[w] <- 1

###Turnout in 2008
tab <- with(data, table(ed4, cat08))
tab/rowSums(tab)

tab <- with(data, table(ed4, cat10))
tab/rowSums(tab)

tab <- with(data, table(ed4, cat12))
tab/rowSums(tab)


###Parent's education
data$par_ed4 <- 2
data$parEduc <- apply(cbind(data$educDad, data$educMom), 1, max, na.rm=TRUE)
w <- which(data$parEduc < 12)
data$par_ed4[w] <- 1
w <- which(data$parEduc > 12)
data$par_ed4[w] <- 3
w <- which(data$parEduc > 15)
data$par_ed4[w] <- 4

###Turnout in 2008
tab <- with(data, table(par_ed4, cat08))
tab/rowSums(tab)

tab <- with(data, table(par_ed4, cat10))
tab/rowSums(tab)

tab <- with(data, table(par_ed4, cat12))
tab/rowSums(tab)

data$abs_party <- abs(data$party04_fill)
data$abs_ideo <- abs(data$ideo04_fill)
data$sub.id <- substr(unique(data$id), 1, 6)
sdat <- data.frame(id = unique(data$sub.id))
grads <- data[which(data$grad == 1),]
sibs <- data[which(data$grad == 0),]

vars <- c("age", "networth04_10", "college_grad", "educ04", "ed4",
          "hui04", "cat08", "cat10", "cat12", "walking_speed", 
          "iq", "cog04", "male", "o04",
          "c04","e04","a04","n04", "abs_party", "abs_ideo")
for (j in vars) {
  sdat[[j]] <- grads[[j]] - sibs[[j]]
}
sdat$id.num <- as.numeric(as.character(sdat$id))


###Leave pair out cross validation
#iterate through sibling pairs leaving one out each time in a model
#without an intercept to assess model fit 
#compare leave-pair-out cross-validation sib to non-sib models

get_scaled_data <- function(res, held.out, vars) {
  newdata = array(NA, c(length(held.out), length(vars)))
  dimnames(newdata) <- list(held.out, vars)
  
  for (j in vars) {
    if (is.element(j, names(data))) newdata[,j] <- data[held.out,j]
    if (!is.element(j, names(data))) {
      nm <- gsub("scale\\(", "", j)
      nm <- gsub("\\)", "", nm)
      
      mu <- attr(res$model[[j]], 'scaled:center')
      sd <- attr(res$model[[j]], 'scaled:scale')
      newdata[,j] <- (data[held.out,nm]-mu)/sd
    }
  }
  return( (newdata) )
}

kfold_cv <- function(mod1, mod2, inds, k = 10,
                     data) {
  res0 <- glm(mod1, data = data[inds,],
              family = 'binomial')
  ids <- names(table(data$sub.id[inds]))
  
  res <- array(NA, c(length(coef(res0))-1, k, 2))
  dimnames(res) <- list(
      names(coef(res0)[2:length(coef(res0))]), 1:k, 
    c("standard", "FE"))
  
  pred <- array(NA, c(length(inds), 2))
  rownames(pred) <- inds
  
  vars <- dimnames(res)[[1]]

  
  folds <- sample(1:k, size = length(ids), replace = TRUE)
  
  
  data$folds <- NA
  
  for (j in 1:k) {
    w <- which(folds == j)
    f <- which(is.element(data$sub.id, ids[w]))
    data$folds[f] <- j
  }
  
  ###Sibling vote data
  ag <- aggregate(y ~ sub.id, data = data[inds,], FUN = sum)
  data$sib_vote <- NA
  for (j in 1:nrow(ag)) {
    w <- which(data$sub.id == ag[j,1])
    data$sib_vote[w] <- ag[j,2]
  }
  
  for (j in 1:k) {
    held.out <- which(data$folds == j)
    kept.in <- which(is.element(data$folds, (1:k)[-j]))
    
    res1 <- glm(mod1, data = data[kept.in,],
                #family = binomial(link = "logit"),
                x = TRUE, y = TRUE)
    pred[as.character(held.out),1] <- data$y[held.out] - predict(res1, 
                                newdata = data[held.out,], 
                                type = 'response')
    
    res[,j,1] <- coef(res1)[vars]
    
    res2 <- speedglm(mod2, data = data[kept.in,],
                #family = binomial(link = "logit"),
                intercept = FALSE)
    newdat <- get_scaled_data(res1, held.out, vars)
    
    fam = coef(res2)[grep("sub.id", names(coef(res2)))]
    fam = c(0, .5, 1)[data$sib_vote[held.out]+1]
    
    xb <- newdat %*% coef(res2)[vars] + fam
    
    pred[as.character(held.out),2] <- data$y[held.out] - xb #1/(1+exp(-xb))
    
    res[,j,2] <- coef(res2)[vars]
    
  }
  
  ##calculate rmse
  rmse <- c( sqrt(mean(pred[,1]^2)),
             sqrt(mean(pred[,2]^2)))
  
  return( list(coefficients = res, cv_pred = pred,
               rmse = rmse))
}


########################Running models with college grad+ rather than years of educ
library(stargazer)
library(speedglm)
############
data$age2 <- data$age*data$age
mod1 <- y ~ college_grad + male + scale(age) + scale(iq) +
  scale(o04) + scale(c04) + scale(e04) + scale(a04) + scale(n04)
mod2 <- y ~ college_grad + male + scale(age) + scale(iq) +
  scale(o04) + scale(c04) + scale(e04) + scale(a04) + scale(n04) + sub.id-1
mod3 <- y ~ college_grad + male + scale(age) + scale(iq) +
  scale(abs_party) + scale(abs_ideo) +
  scale(o04) + scale(c04) + scale(e04) + scale(a04) + scale(n04)
mod4 <- y ~ college_grad + male + scale(age) + scale(iq) +
  scale(abs_party) + scale(abs_ideo) +
  scale(o04) + scale(c04) + scale(e04) + scale(a04) + scale(n04) + sub.id-1


nmod <- y ~ college_grad + male + scale(age) + scale(iq) + scale(abs_party) + 
  scale(abs_ideo) + o04 + c04 + e04 + a04 + n04 + id.num + cat08 + cat10 + cat12

lab <- c("College grad", "Male", "Age", "Age2", 
         "IQ", "Party attachment", "Ideological extremity", 
         "Openness","Conscientiousness","Extraversion",
         "Agreeableness", "Neuroticism")


ALL.RES <- list()
for (F in c("cat08","cat10","cat12")) {
  res <- list()
  nm <- F
  data$y <- data[[nm]]
  sdat$y <- sdat[[nm]]
  
  nres <- lm(nmod , data = sdat)
  ids <- c(paste(nres$model$id.num, "G", sep = ""),
           paste(nres$model$id.num, "S", sep = ""))
  
  inds <- which(is.element(data$id, ids))
  
  
  res[[1]] <- glm(mod1, data = data[inds,])#,
                  #family = binomial(link = "logit"))
  res[[2]] <- speedglm(mod2, data = data[inds,],
                       #family = binomial(link = "logit"),
                       intercept = FALSE)
  cv.err1 <- kfold_cv(mod1, mod2, inds, k = 10, data = data)
  res[[1]]$rmse = cv.err1$rmse[1]
  res[[2]]$rmse = cv.err1$rmse[2]
  
  res[[3]] <- glm(mod3, data = data[inds,])#,
                  #family = binomial(link = "logit"))
  res[[4]] <- speedglm(mod4, data = data[inds,],
                       #family = binomial(link = "logit"),
                       intercept = FALSE)
  cv.err2 <-   kfold_cv(mod3, mod4, inds, k = 10, data = data)
  res[[3]]$rmse = cv.err2$rmse[1]
  res[[4]]$rmse = cv.err2$rmse[2]
  
  ALL.RES[[F]] <- res
}

print.res <- function(mod, digits = 3,
                      container = container,
                      col) {
  res <- summary(mod)$coefficients
  for (j in 1:nrow(container)) {
    w <- which(names(mod$coefficients) == rownames(container)[j])
    if (length(w)==0) next
    
    if (class(res[w,4]) == "factor") p = as.numeric(as.character(res[w,4]))
    if (class(res[w,4]) == "numeric") p = res[w,4]
    star <- ifelse(p < .05, "*", "")
    if (class(res[w,1])=='factor') beta = as.numeric(as.character(res[w,1]))
    if (class(res[w,1])=='numeric') beta = res[w,1]

    container[j,col] <- paste(round(beta, digits), star, sep = '')
    
    if (round(beta,digits)==0) container[j,col] <- paste(beta, star, sep= "")
    container[j+1,col] <- paste("'(", round(res[w,2], digits), ")", sep = '')
  }
  res <- summary(mod)
  container['n',col] <- length(mod$residuals)
  container['rmse',col] <- round(mod$rmse, 3)
  return(container[,col])
}

to.print <- c(Intercept = "(Intercept)", "",
              "College grad" = "college_grad", "",
              Male = "male", "",
              "Age (scaled)" = "scale(age)", "",
              "IQ (scaled)" = "scale(iq)","",
              "Partisanship (folded)" = "scale(abs_party)","",
              "Ideological extremity" = "scale(abs_ideo)","",
              "O (scaled)" = "scale(o04)","",
              "C (scaled)" = "scale(c04)","",
              "E (scaled)" = "scale(e04)","",
              "A (scaled)" = "scale(a04)","",
              "N (scaled)" = "scale(n04)","")

reg_res08 <- array('', c(length(to.print)+3, 4))
rownames(reg_res08) <- c(to.print, c("n", "rmse", 
                                   "rmse (cross-validated)"))
reg_res10 <- reg_res12 <- reg_res08

for (j in 1:4) reg_res08[,j] <- print.res(ALL.RES[[1]][[j]], 
                                        container = reg_res08, col = j)
for (j in 1:4) reg_res10[,j] <- print.res(ALL.RES[[2]][[j]], 
                                          container = reg_res10, col = j)
for (j in 1:4) reg_res12[,j] <- print.res(ALL.RES[[3]][[j]], 
                                          container = reg_res12, col = j)



#######################Pooled model

dat08 <- data[inds, c("cat08", "college_grad", "male","age", "age2",
                      "iq","abs_party","abs_ideo","o04","c04",
                      "e04","a04","n04","sub.id")]
names(dat08)[1] = "y"
dat08$year = "2008"

dat10 <- data[inds, c("cat10", "college_grad", "male","age", "age2",
                      "iq","abs_party","abs_ideo","o04","c04",
                      "e04","a04","n04","sub.id")]
names(dat10)[1] = "y"
dat10$year = "2010"

dat12 <- data[inds, c("cat12", "college_grad", "male","age", "age2",
                      "iq","abs_party","abs_ideo","o04","c04",
                      "e04","a04","n04","sub.id")]
names(dat12)[1] = "y"
dat12$year = "2012"

pooled <- rbind(dat08, dat10, dat12)
pool_res <- list()

mod1_pooled <- update(mod1, "~ . + year")
mod2_pooled <- update(mod2, "~ . + year")
mod3_pooled <- update(mod3, "~ . + year")
mod4_pooled <- update(mod4, "~ . + year")

pool_res[[1]] <- glm(mod1_pooled, data = pooled)#,
        #family = binomial(link = "logit"))
pool_res[[2]] <- speedglm(mod2_pooled, data = pooled,
                     #family = binomial(link = "logit"),
                     intercept = FALSE)

pool_res[[3]] <- glm(mod3_pooled, data = pooled)#,
          #family = binomial(link = "logit"))
pool_res[[4]] <- speedglm(mod4_pooled, data = pooled,
                          #family = binomial(link = "logit"),
                          intercept = FALSE)


print.res <- function(mod, digits = 3,
                      container = container,
                      col) {
  res <- summary(mod)$coefficients
  for (j in 1:nrow(container)) {
    w <- which(names(mod$coefficients) == rownames(container)[j])
    if (length(w)==0) next
    
    if (class(res[w,4]) == "factor") p = as.numeric(as.character(res[w,4]))
    if (class(res[w,4]) == "numeric") p = res[w,4]
    star <- ifelse(p < .05, "*", "")
    if (class(res[w,1])=='factor') beta = as.numeric(as.character(res[w,1]))
    if (class(res[w,1])=='numeric') beta = res[w,1]
    container[j,col] <- paste(round(beta, digits), star, sep = '')
    container[j+1,col] <- paste("'(", round(res[w,2], digits), ")", sep = '')
  }
  res <- summary(mod)
  container['n',col] <- length(mod$residuals)
  return(container[,col])
}

to.print <- c(Intercept = "(Intercept)", "",
  "College grad" = "college_grad", "",
  Male = "male", "",
  "Age (scaled)" = "scale(age)", "",
  "IQ (scaled)" = "scale(iq)","",
  "Partisanship (folded)" = "scale(abs_party)","",
  "Ideological extremity" = "scale(abs_ideo)","",
  "O (scaled)" = "scale(o04)","",
  "C (scaled)" = "scale(c04)","",
  "E (scaled)" = "scale(e04)","",
  "A (scaled)" = "scale(a04)","",
  "N (scaled)" = "scale(n04)","",
  "2010" = "year2010", "", "2012" = "year2012", "")

pool_reg <- array('', c(length(to.print)+1, 4))
rownames(pool_reg) <- c(to.print, c("n"))

for (j in 1:4) pool_reg[,j] <- print.res(pool_res[[j]], 
                                          container = pool_reg, col = j)


###############Bootstrapped estimation
boot_res <- array(NA, c(1000, length(to.print), 4))
dimnames(boot_res) <- list(NULL, to.print, NULL)
sampled <- names(table(data$sub.id[inds]))

for (j in 1:1000) {
  ##sample with replacement from sibling pairs
  bsamp <- sample(sampled, size = length(sampled), replace = TRUE)
  
  ##set up new data
  tab <- table(bsamp)
  w <- which(tab == 1)
  
  bdat <- data[which(is.element(data$sub.id, names(w))),]
  reps <- names(table(tab)[-1])
  for (k in reps) {
    w <- which(tab == k)
    
    for (i in 1:k) {
      bdat2 <- data[which(is.element(data$sub.id, names(w))),]
      bdat2$sub.id = paste(bdat2$sub.id, i)
      bdat <- rbind(bdat, bdat2)
    }

  }
  
  bdat08 <- bdat[inds, c("cat08", "college_grad", "male","age", "age2",
                        "iq","abs_party","abs_ideo","o04","c04",
                        "e04","a04","n04","sub.id")]
  names(bdat08)[1] = "y"
  bdat08$year = "2008"
  
  bdat10 <- bdat[inds, c("cat10", "college_grad", "male","age", "age2",
                        "iq","abs_party","abs_ideo","o04","c04",
                        "e04","a04","n04","sub.id")]
  names(bdat10)[1] = "y"
  bdat10$year = "2010"
  
  bdat12 <- bdat[inds, c("cat12", "college_grad", "male","age", "age2",
                        "iq","abs_party","abs_ideo","o04","c04",
                        "e04","a04","n04","sub.id")]
  names(bdat12)[1] = "y"
  bdat12$year = "2012"
  
  bpooled <- rbind(bdat08, bdat10, bdat12)
  bpool_res <- list()
  
  bpool_res[[1]] <- glm(mod1_pooled, data = bpooled)#,
  #family = binomial(link = "logit"))
  coef <- coef(bpool_res[[1]])
  boot_res[j,,1] <- coef[to.print]
  
  bpool_res[[2]] <- speedglm(mod2_pooled, data = bpooled,
                            #family = binomial(link = "logit"),
                            intercept = FALSE)
  coef <- coef(bpool_res[[2]])
  boot_res[j,,2] <- coef[to.print]
  
  bpool_res[[3]] <- glm(mod3_pooled, data = bpooled)#,
  #family = binomial(link = "logit"))
  coef <- coef(bpool_res[[3]])
  boot_res[j,,3] <- coef[to.print]
  
  bpool_res[[4]] <- speedglm(mod4_pooled, data = bpooled,
                            #family = binomial(link = "logit"),
                            intercept = FALSE)
  coef <- coef(bpool_res[[4]])
  boot_res[j,,4] <- coef[to.print]
  
  
}

k <- 3
cbind(round(apply(boot_res[,,k], 2, mean), 3),
round(apply(boot_res[,,k], 2, sd), 3))

apply(boot_res[,,k], 2, mean)

####Plotting education effect different years in the same chart

###First panel

ed.beta <- c(rm.str(reg_res08[1,1]),
             rm.str(reg_res10[1,1]),
             rm.str(reg_res12[1,1]))
ed.se <- c(rm.str(reg_res08[2,1]),
           rm.str(reg_res10[2,1]),
           rm.str(reg_res12[2,1]))
fe.beta <- c(rm.str(reg_res08[1,2]),
             rm.str(reg_res10[1,2]),
             rm.str(reg_res12[1,2]))
fe.se <- c(rm.str(reg_res08[2,2]),
           rm.str(reg_res10[2,2]),
           rm.str(reg_res12[2,2]))

par(mar = c(4,.1,2,.2))
par(mfrow = c(1, 2))
plot(0, 0, pch = '', xlim = c(-.15,.15), ylim = c(.5,3.5),
     axes = FALSE, xlab = 'Effect size', ylab = '',
    main = "Models 1 & 2")
axis(1, at = c(-.1, -.05, 0, .05, .1))
abline(v = 0)

rm.str <- function(s) {
  as.numeric(gsub("[^0-9\\.\\-]", "", s))
}


years <- c(2008, 2010, 2012)
for (k in 1:3) {
  mu <- ed.beta[k]
  se <- ed.se[k]
  
  sig <- ifelse(sign(mu - 1.96*se) == sign(mu + 1.96*se), 2, 1)
  
  segments( x0 = mu - 1.96*se, x1 = mu + 1.96*se,
            y0 = k+.15, y1 = k+.15, lwd = 1)
  points( mu, k+.15, pch = 20, cex = 1)
  
  
  
  text(mu+2.4*se, k, years[k], cex = .67, font = sig)
  
}

for (k in 1:3) {
  mu <- fe.beta[k]
  se <- fe.se[k]
  
  sig <- ifelse(sign(mu - 1.96*se) == sign(mu + 1.96*se), 2, 1)
  
  segments( x0 = mu - 1.96*se, x1 = mu + 1.96*se,
            y0 = k-.15, y1 = k-.15, lwd = 1, lty = 2)
  points( mu, k-.15, pch = 1, cex = 1)
  
}
legend('topleft', c("Standard model", "Sibling fixed-effects"),
       lty = c(1, 2), pch = c(20, 1), cex = .75, bty = 'n')

###Second panel
 
  par(mar = c(4,.2,2,.1))
  plot(0, 0, pch = '', xlim = c(-.15,.15), ylim = c(.5,3.5),
       axes = FALSE, xlab = 'Effect size', ylab = '',
       main = "Models 3 & 4")
  axis(1, at = c(-.1,-.05,0,.05,.1))
  abline(v = 0)

  
  ed.beta <- c(rm.str(reg_res08[1,3]),
               rm.str(reg_res10[1,3]),
               rm.str(reg_res12[1,3]))
  ed.se <- c(rm.str(reg_res08[2,3]),
             rm.str(reg_res10[2,3]),
             rm.str(reg_res12[2,3]))
  fe.beta <- c(rm.str(reg_res08[1,4]),
               rm.str(reg_res10[1,4]),
               rm.str(reg_res12[1,4]))
  fe.se <- c(rm.str(reg_res08[2,4]),
             rm.str(reg_res10[2,4]),
             rm.str(reg_res12[2,4]))
  
  years <- c(2008, 2010, 2012)
  for (k in 1:3) {
    mu <- ed.beta[k]
    se <- ed.se[k]
    
    sig <- ifelse(sign(mu - 1.96*se) == sign(mu + 1.96*se), 2, 1)
    
    segments( x0 = mu - 1.96*se, x1 = mu + 1.96*se,
              y0 = k+.15, y1 = k+.15, lwd = 1)
    points( mu, k+.15, pch = 20, cex = 1)
    
    
    
    text(mu+2.4*se, k, years[k], cex = .67, font = sig)
    
  }
  
  for (k in 1:3) {
    mu <- fe.beta[k]
    se <- fe.se[k]
    
    sig <- ifelse(sign(mu - 1.96*se) == sign(mu + 1.96*se), 2, 1)
    
    segments( x0 = mu - 1.96*se, x1 = mu + 1.96*se,
              y0 = k-.15, y1 = k-.15, lwd = 1, lty = 2)
    points( mu, k-.15, pch = 1, cex = 1)
    
  }

  